The input files for this analysis are .mirna format files that are generated through miraligner. PCR duplicates are removed using seqcluster.
suppressMessages(
c(library(isomiRs),
library(DESeq2),
library(tximport),
library(pheatmap),
library(gridExtra),
library(grid),
library(ggplot2),
library(lattice),
library(reshape),
library(mixOmics),
library(gplots),
library(RColorBrewer),
library(readr),
library(dplyr),
library(data.table),
library(tidyverse),
library(rtracklayer),
library(Biostrings),
library(Rsubread),
library(ggrepel),
library(CLIPanalyze),
library(plotly)
)
)
## [1] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [4] "matrixStats" "Biobase" "GenomicRanges"
## [7] "GenomeInfoDb" "IRanges" "S4Vectors"
## [10] "BiocGenerics" "parallel" "stats4"
## [13] "DiscriMiner" "stats" "graphics"
## [16] "grDevices" "utils" "datasets"
## [19] "methods" "base" "DESeq2"
## [22] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [25] "matrixStats" "Biobase" "GenomicRanges"
## [28] "GenomeInfoDb" "IRanges" "S4Vectors"
## [31] "BiocGenerics" "parallel" "stats4"
## [34] "DiscriMiner" "stats" "graphics"
## [37] "grDevices" "utils" "datasets"
## [40] "methods" "base" "tximport"
## [43] "DESeq2" "isomiRs" "SummarizedExperiment"
## [46] "DelayedArray" "matrixStats" "Biobase"
## [49] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [52] "S4Vectors" "BiocGenerics" "parallel"
## [55] "stats4" "DiscriMiner" "stats"
## [58] "graphics" "grDevices" "utils"
## [61] "datasets" "methods" "base"
## [64] "pheatmap" "tximport" "DESeq2"
## [67] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [70] "matrixStats" "Biobase" "GenomicRanges"
## [73] "GenomeInfoDb" "IRanges" "S4Vectors"
## [76] "BiocGenerics" "parallel" "stats4"
## [79] "DiscriMiner" "stats" "graphics"
## [82] "grDevices" "utils" "datasets"
## [85] "methods" "base" "gridExtra"
## [88] "pheatmap" "tximport" "DESeq2"
## [91] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [94] "matrixStats" "Biobase" "GenomicRanges"
## [97] "GenomeInfoDb" "IRanges" "S4Vectors"
## [100] "BiocGenerics" "parallel" "stats4"
## [103] "DiscriMiner" "stats" "graphics"
## [106] "grDevices" "utils" "datasets"
## [109] "methods" "base" "grid"
## [112] "gridExtra" "pheatmap" "tximport"
## [115] "DESeq2" "isomiRs" "SummarizedExperiment"
## [118] "DelayedArray" "matrixStats" "Biobase"
## [121] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [124] "S4Vectors" "BiocGenerics" "parallel"
## [127] "stats4" "DiscriMiner" "stats"
## [130] "graphics" "grDevices" "utils"
## [133] "datasets" "methods" "base"
## [136] "ggplot2" "grid" "gridExtra"
## [139] "pheatmap" "tximport" "DESeq2"
## [142] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [145] "matrixStats" "Biobase" "GenomicRanges"
## [148] "GenomeInfoDb" "IRanges" "S4Vectors"
## [151] "BiocGenerics" "parallel" "stats4"
## [154] "DiscriMiner" "stats" "graphics"
## [157] "grDevices" "utils" "datasets"
## [160] "methods" "base" "lattice"
## [163] "ggplot2" "grid" "gridExtra"
## [166] "pheatmap" "tximport" "DESeq2"
## [169] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [172] "matrixStats" "Biobase" "GenomicRanges"
## [175] "GenomeInfoDb" "IRanges" "S4Vectors"
## [178] "BiocGenerics" "parallel" "stats4"
## [181] "DiscriMiner" "stats" "graphics"
## [184] "grDevices" "utils" "datasets"
## [187] "methods" "base" "reshape"
## [190] "lattice" "ggplot2" "grid"
## [193] "gridExtra" "pheatmap" "tximport"
## [196] "DESeq2" "isomiRs" "SummarizedExperiment"
## [199] "DelayedArray" "matrixStats" "Biobase"
## [202] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [205] "S4Vectors" "BiocGenerics" "parallel"
## [208] "stats4" "DiscriMiner" "stats"
## [211] "graphics" "grDevices" "utils"
## [214] "datasets" "methods" "base"
## [217] "mixOmics" "MASS" "reshape"
## [220] "lattice" "ggplot2" "grid"
## [223] "gridExtra" "pheatmap" "tximport"
## [226] "DESeq2" "isomiRs" "SummarizedExperiment"
## [229] "DelayedArray" "matrixStats" "Biobase"
## [232] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [235] "S4Vectors" "BiocGenerics" "parallel"
## [238] "stats4" "DiscriMiner" "stats"
## [241] "graphics" "grDevices" "utils"
## [244] "datasets" "methods" "base"
## [247] "gplots" "mixOmics" "MASS"
## [250] "reshape" "lattice" "ggplot2"
## [253] "grid" "gridExtra" "pheatmap"
## [256] "tximport" "DESeq2" "isomiRs"
## [259] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [262] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [265] "IRanges" "S4Vectors" "BiocGenerics"
## [268] "parallel" "stats4" "DiscriMiner"
## [271] "stats" "graphics" "grDevices"
## [274] "utils" "datasets" "methods"
## [277] "base" "RColorBrewer" "gplots"
## [280] "mixOmics" "MASS" "reshape"
## [283] "lattice" "ggplot2" "grid"
## [286] "gridExtra" "pheatmap" "tximport"
## [289] "DESeq2" "isomiRs" "SummarizedExperiment"
## [292] "DelayedArray" "matrixStats" "Biobase"
## [295] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [298] "S4Vectors" "BiocGenerics" "parallel"
## [301] "stats4" "DiscriMiner" "stats"
## [304] "graphics" "grDevices" "utils"
## [307] "datasets" "methods" "base"
## [310] "readr" "RColorBrewer" "gplots"
## [313] "mixOmics" "MASS" "reshape"
## [316] "lattice" "ggplot2" "grid"
## [319] "gridExtra" "pheatmap" "tximport"
## [322] "DESeq2" "isomiRs" "SummarizedExperiment"
## [325] "DelayedArray" "matrixStats" "Biobase"
## [328] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [331] "S4Vectors" "BiocGenerics" "parallel"
## [334] "stats4" "DiscriMiner" "stats"
## [337] "graphics" "grDevices" "utils"
## [340] "datasets" "methods" "base"
## [343] "dplyr" "readr" "RColorBrewer"
## [346] "gplots" "mixOmics" "MASS"
## [349] "reshape" "lattice" "ggplot2"
## [352] "grid" "gridExtra" "pheatmap"
## [355] "tximport" "DESeq2" "isomiRs"
## [358] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [361] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [364] "IRanges" "S4Vectors" "BiocGenerics"
## [367] "parallel" "stats4" "DiscriMiner"
## [370] "stats" "graphics" "grDevices"
## [373] "utils" "datasets" "methods"
## [376] "base" "data.table" "dplyr"
## [379] "readr" "RColorBrewer" "gplots"
## [382] "mixOmics" "MASS" "reshape"
## [385] "lattice" "ggplot2" "grid"
## [388] "gridExtra" "pheatmap" "tximport"
## [391] "DESeq2" "isomiRs" "SummarizedExperiment"
## [394] "DelayedArray" "matrixStats" "Biobase"
## [397] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [400] "S4Vectors" "BiocGenerics" "parallel"
## [403] "stats4" "DiscriMiner" "stats"
## [406] "graphics" "grDevices" "utils"
## [409] "datasets" "methods" "base"
## [412] "forcats" "stringr" "purrr"
## [415] "tidyr" "tibble" "tidyverse"
## [418] "data.table" "dplyr" "readr"
## [421] "RColorBrewer" "gplots" "mixOmics"
## [424] "MASS" "reshape" "lattice"
## [427] "ggplot2" "grid" "gridExtra"
## [430] "pheatmap" "tximport" "DESeq2"
## [433] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [436] "matrixStats" "Biobase" "GenomicRanges"
## [439] "GenomeInfoDb" "IRanges" "S4Vectors"
## [442] "BiocGenerics" "parallel" "stats4"
## [445] "DiscriMiner" "stats" "graphics"
## [448] "grDevices" "utils" "datasets"
## [451] "methods" "base" "rtracklayer"
## [454] "forcats" "stringr" "purrr"
## [457] "tidyr" "tibble" "tidyverse"
## [460] "data.table" "dplyr" "readr"
## [463] "RColorBrewer" "gplots" "mixOmics"
## [466] "MASS" "reshape" "lattice"
## [469] "ggplot2" "grid" "gridExtra"
## [472] "pheatmap" "tximport" "DESeq2"
## [475] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [478] "matrixStats" "Biobase" "GenomicRanges"
## [481] "GenomeInfoDb" "IRanges" "S4Vectors"
## [484] "BiocGenerics" "parallel" "stats4"
## [487] "DiscriMiner" "stats" "graphics"
## [490] "grDevices" "utils" "datasets"
## [493] "methods" "base" "Biostrings"
## [496] "XVector" "rtracklayer" "forcats"
## [499] "stringr" "purrr" "tidyr"
## [502] "tibble" "tidyverse" "data.table"
## [505] "dplyr" "readr" "RColorBrewer"
## [508] "gplots" "mixOmics" "MASS"
## [511] "reshape" "lattice" "ggplot2"
## [514] "grid" "gridExtra" "pheatmap"
## [517] "tximport" "DESeq2" "isomiRs"
## [520] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [523] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [526] "IRanges" "S4Vectors" "BiocGenerics"
## [529] "parallel" "stats4" "DiscriMiner"
## [532] "stats" "graphics" "grDevices"
## [535] "utils" "datasets" "methods"
## [538] "base" "Rsubread" "Biostrings"
## [541] "XVector" "rtracklayer" "forcats"
## [544] "stringr" "purrr" "tidyr"
## [547] "tibble" "tidyverse" "data.table"
## [550] "dplyr" "readr" "RColorBrewer"
## [553] "gplots" "mixOmics" "MASS"
## [556] "reshape" "lattice" "ggplot2"
## [559] "grid" "gridExtra" "pheatmap"
## [562] "tximport" "DESeq2" "isomiRs"
## [565] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [568] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [571] "IRanges" "S4Vectors" "BiocGenerics"
## [574] "parallel" "stats4" "DiscriMiner"
## [577] "stats" "graphics" "grDevices"
## [580] "utils" "datasets" "methods"
## [583] "base" "ggrepel" "Rsubread"
## [586] "Biostrings" "XVector" "rtracklayer"
## [589] "forcats" "stringr" "purrr"
## [592] "tidyr" "tibble" "tidyverse"
## [595] "data.table" "dplyr" "readr"
## [598] "RColorBrewer" "gplots" "mixOmics"
## [601] "MASS" "reshape" "lattice"
## [604] "ggplot2" "grid" "gridExtra"
## [607] "pheatmap" "tximport" "DESeq2"
## [610] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [613] "matrixStats" "Biobase" "GenomicRanges"
## [616] "GenomeInfoDb" "IRanges" "S4Vectors"
## [619] "BiocGenerics" "parallel" "stats4"
## [622] "DiscriMiner" "stats" "graphics"
## [625] "grDevices" "utils" "datasets"
## [628] "methods" "base" "CLIPanalyze"
## [631] "GenomicAlignments" "Rsamtools" "GenomicFeatures"
## [634] "AnnotationDbi" "plyr" "ggrepel"
## [637] "Rsubread" "Biostrings" "XVector"
## [640] "rtracklayer" "forcats" "stringr"
## [643] "purrr" "tidyr" "tibble"
## [646] "tidyverse" "data.table" "dplyr"
## [649] "readr" "RColorBrewer" "gplots"
## [652] "mixOmics" "MASS" "reshape"
## [655] "lattice" "ggplot2" "grid"
## [658] "gridExtra" "pheatmap" "tximport"
## [661] "DESeq2" "isomiRs" "SummarizedExperiment"
## [664] "DelayedArray" "matrixStats" "Biobase"
## [667] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [670] "S4Vectors" "BiocGenerics" "parallel"
## [673] "stats4" "DiscriMiner" "stats"
## [676] "graphics" "grDevices" "utils"
## [679] "datasets" "methods" "base"
## [682] "plotly" "CLIPanalyze" "GenomicAlignments"
## [685] "Rsamtools" "GenomicFeatures" "AnnotationDbi"
## [688] "plyr" "ggrepel" "Rsubread"
## [691] "Biostrings" "XVector" "rtracklayer"
## [694] "forcats" "stringr" "purrr"
## [697] "tidyr" "tibble" "tidyverse"
## [700] "data.table" "dplyr" "readr"
## [703] "RColorBrewer" "gplots" "mixOmics"
## [706] "MASS" "reshape" "lattice"
## [709] "ggplot2" "grid" "gridExtra"
## [712] "pheatmap" "tximport" "DESeq2"
## [715] "isomiRs" "SummarizedExperiment" "DelayedArray"
## [718] "matrixStats" "Biobase" "GenomicRanges"
## [721] "GenomeInfoDb" "IRanges" "S4Vectors"
## [724] "BiocGenerics" "parallel" "stats4"
## [727] "DiscriMiner" "stats" "graphics"
## [730] "grDevices" "utils" "datasets"
## [733] "methods" "base"
Set working directory to the folder that contains only gene count .mirna files
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/miRNA_Analysis/Gene Counts/HEAP-CLIP_09282019_miRNA")
inDir = getwd()
mirFiles = list.files(inDir, pattern=".mirna$", full.names = TRUE)
basename(mirFiles)
## [1] "HVA4_miRNA_LIB043893_GEN00166438_R1_barcode.cutadapt.mirna"
## [2] "HVA5_miRNA_LIB043893_GEN00166441_R1_barcode.cutadapt.mirna"
## [3] "HVA6_miRNA_LIB043893_GEN00166444_R1_barcode.cutadapt.mirna"
## [4] "HVAK4_miRNA_LIB043893_GEN00166447_R1_barcode.cutadapt.mirna"
## [5] "HVAK5_miRNA_LIB043893_GEN00166450_R1_barcode.cutadapt.mirna"
## [6] "HVAK6_miRNA_LIB043893_GEN00166453_R1_barcode.cutadapt.mirna"
isomiRsampletable <- data.frame(
row.names = c('HVA_4_miRNA','HVA_5_miRNA','HVA_6_miRNA','HVAK_4_miRNA','HVAK_5_miRNA', 'HVAK_6_miRNA'),
condition = c('control','control','control','experimental','experimental', 'experimental'),
libType = c('single-end','single-end','single-end','single-end','single-end','single-end')
)
ids <- IsomirDataSeqFromFiles(mirFiles,
coldata = isomiRsampletable,
design = ~ condition)
dir.create("PDF_figure", showWarnings = FALSE)
isoPlot(ids, type="all")
## Ussing 905 isomiRs.
pdf("PDF_figure/isoPlot.pdf",
width = 8,
height = 6)
isoPlot(ids, type="all")
## Ussing 905 isomiRs.
dev.off()
## quartz_off_screen
## 2
# overview of the counts
head(counts(ids))
## HVA_4_miRNA HVA_5_miRNA HVA_6_miRNA HVAK_4_miRNA HVAK_5_miRNA
## mmu-let-7a-5p 1864 303 1510 2418 2010
## mmu-let-7b-3p 5 9 7 5 7
## mmu-let-7b-5p 5724 6348 5986 6298 5846
## mmu-let-7c-5p 3595 3365 3612 3721 3583
## mmu-let-7d-3p 239 597 504 349 321
## mmu-let-7d-5p 531 152 489 588 569
## HVAK_6_miRNA
## mmu-let-7a-5p 2096
## mmu-let-7b-3p 3
## mmu-let-7b-5p 5955
## mmu-let-7c-5p 3775
## mmu-let-7d-3p 343
## mmu-let-7d-5p 544
# output the count matrix
ids = isoNorm(ids, formula = ~ condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
rawCountTable <- as.data.frame(counts(ids, norm = TRUE))
# normalization is doing using rlog from DESeq2
pheatmap(counts(ids, norm=TRUE),
annotation_col = data.frame(colData(ids)[,1,drop=FALSE]),
show_rownames = FALSE, scale="row")
pdf("PDF_figure/Pheatmap.pdf",
width = 5,
height = 4)
pheatmap(counts(ids, norm=TRUE),
annotation_col = data.frame(colData(ids)[,1,drop=FALSE]),
show_rownames = FALSE, scale="row")
dev.off()
## pdf
## 3
# Perform differential analysis
dds_analysis <- isoDE(ids, formula = ~condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# Get a DESeq object using the count matrix
dds <- DESeqDataSetFromMatrix(counts(ids),
colData(ids), design = ~ condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_norm <- estimateSizeFactors(dds)
# transform the dataset for MA plot and PCA plotting
dds_transform <- varianceStabilizingTransformation(dds)
plotMA(dds_analysis)
pdf("PDF_figure/QC_MAPlot.pdf",
width = 5,
height = 4)
plotMA(dds_analysis)
dev.off()
## quartz_off_screen
## 2
plotPCA(dds_transform) +
geom_text(aes(label=name), vjust = 2, hjust = -0.1) +
xlim(-12,30) +ylim(-20,15)
pdf("PDF_figure/QC_PCAPlot.pdf",
width = 6,
height = 4)
plotPCA(dds_transform) +
geom_text(aes(label=name), vjust = 2, hjust = -0.1) +
xlim(-12,30) +ylim(-20,15)
dev.off()
## quartz_off_screen
## 2
#Boxplots
pseudoCount = log2(rawCountTable + 1)
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
## Warning in melt(pseudoCount, variable_name = "Samples"): The melt generic in
## data.table has been passed a data.frame and will attempt to redirect to the
## relevant reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend the
## namespace like reshape2::melt(pseudoCount). In the next version, this warning
## will become an error.
df <- data.frame(df, Condition = substr(df$variable,1,4))
ggplot(df, aes(x=variable, y=value, fill = Condition)) + geom_boxplot() + xlab("") +
ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
pdf("PDF_figure/QC_BoxPlot.pdf",
width = 5,
height = 4)
ggplot(df, aes(x=variable, y=value, fill = Condition)) + geom_boxplot() + xlab("") +
ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
dev.off()
## quartz_off_screen
## 2
There is good seperation between HVA and HVAK samples. I can proceed to mix the miRNA datasets from HEAP-CLIP_052119 and HEAP-CLIP_09282019.
Set working directory to the folder that contains only gene count .mirna files
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/miRNA_Analysis/Gene Counts")
inDir = getwd()
mirFiles = list.files(inDir, pattern=".mirna$", full.names = TRUE)
basename(mirFiles)
## [1] "HVA1_miRNA_LIB041590_GEN00154970_R1_barcode.cutadapt.mirna"
## [2] "HVA2_miRNA_LIB041590_GEN00154973_R1_barcode.cutadapt.mirna"
## [3] "HVA3_miRNA_LIB041590_GEN00154976_R1_barcode.cutadapt.mirna"
## [4] "HVA4_miRNA_LIB043893_GEN00166438_R1_barcode.cutadapt.mirna"
## [5] "HVA5_miRNA_LIB043893_GEN00166441_R1_barcode.cutadapt.mirna"
## [6] "HVA6_miRNA_LIB043893_GEN00166444_R1_barcode.cutadapt.mirna"
## [7] "HVAK1_miRNA_LIB041590_GEN00154979_R1_barcode.cutadapt.mirna"
## [8] "HVAK2_miRNA_LIB041590_GEN00154982_R1_barcode.cutadapt.mirna"
## [9] "HVAK3_miRNA_LIB041590_GEN00154985_R1_barcode.cutadapt.mirna"
## [10] "HVAK4_miRNA_LIB043893_GEN00166447_R1_barcode.cutadapt.mirna"
## [11] "HVAK5_miRNA_LIB043893_GEN00166450_R1_barcode.cutadapt.mirna"
## [12] "HVAK6_miRNA_LIB043893_GEN00166453_R1_barcode.cutadapt.mirna"
isomiRsampletable <- data.frame(
row.names = c('HVA_1_miR','HVA_2_miR','HVA_3_miR','HVA_4_miR','HVA_5_miR','HVA_6_miR','HVAK_1_miR','HVAK_2_miR','HVAK_3_miR','HVAK_4_miR','HVAK_5_miR','HVAK_6_miR'),
condition = c('control','control','control','control','control','control','experimental', 'experimental', 'experimental','experimental', 'experimental', 'experimental'),
libType = c('single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end','single-end'),
batch = c('1','1','1','2','2','2','1','1','1','2','2','2')
)
ids <- IsomirDataSeqFromFiles(mirFiles,
coldata = isomiRsampletable,
design = ~ batch + condition)
isoPlot(ids, type="all")
## Ussing 987 isomiRs.
pdf("PDF_figure/isoPlot_merged.pdf",
width = 8,
height = 6)
isoPlot(ids, type="all")
## Ussing 987 isomiRs.
dev.off()
## quartz_off_screen
## 2
# overview of the counts
head(counts(ids))
## HVA_1_miR HVA_2_miR HVA_3_miR HVA_4_miR HVA_5_miR HVA_6_miR
## mmu-let-7a-5p 1664 881 1036 1864 303 1510
## mmu-let-7b-3p 8 7 8 5 9 7
## mmu-let-7b-5p 4388 3056 3177 5724 6348 5986
## mmu-let-7c-5p 2577 2105 2226 3595 3365 3612
## mmu-let-7d-3p 214 166 204 239 597 504
## mmu-let-7d-5p 506 306 425 531 152 489
## HVAK_1_miR HVAK_2_miR HVAK_3_miR HVAK_4_miR HVAK_5_miR HVAK_6_miR
## mmu-let-7a-5p 1445 1226 1258 2418 2010 2096
## mmu-let-7b-3p 6 8 12 5 7 3
## mmu-let-7b-5p 3451 3678 3555 6298 5846 5955
## mmu-let-7c-5p 2366 2433 2238 3721 3583 3775
## mmu-let-7d-3p 174 194 183 349 321 343
## mmu-let-7d-5p 380 393 362 588 569 544
# output the count matrix
rawCountTable <- as.data.frame(counts(ids))
# Get a DESeq object using the count matrix
dds <- DESeqDataSetFromMatrix(rawCountTable,
colData(ids), design = ~ batch + condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_analysis <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
MA plot was generated.
plotMA(dds_analysis)
pdf("PDF_figure/QC_MAPlot_merged.pdf",
width = 5,
height = 4)
plotMA(dds_analysis)
dev.off()
## quartz_off_screen
## 2
Data is transformed and pseudocount is calculated.
dds_norm <- estimateSizeFactors(dds)
rawCountTable <- as.data.frame(counts(dds_norm, normalized = TRUE))
rawCountTable_no_norm <- as.data.frame(counts(dds_norm))
pseudoCount = log2(rawCountTable + 1)
grid.arrange(
ggplot(pseudoCount, aes(x= HVA_1_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_1_miR"),
ggplot(pseudoCount, aes(x= HVA_2_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_2_miR"),
ggplot(pseudoCount, aes(x= HVA_3_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_3_miR"),
ggplot(pseudoCount, aes(x= HVA_4_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_4_miR"),
ggplot(pseudoCount, aes(x= HVA_5_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_5_miR"),
ggplot(pseudoCount, aes(x= HVA_6_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_6_miR"),
ggplot(pseudoCount, aes(x= HVAK_1_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_1_miR"),
ggplot(pseudoCount, aes(x= HVAK_2_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_2_miR"),
ggplot(pseudoCount, aes(x= HVAK_3_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_3_miR"),
ggplot(pseudoCount, aes(x= HVAK_4_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_4_miR"),
ggplot(pseudoCount, aes(x= HVAK_5_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_5_miR"),
ggplot(pseudoCount, aes(x= HVAK_6_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_6_miR"),
nrow = 3)
pdf("PDF_figure/QC_histogram.pdf",
width = 10,
height = 8)
grid.arrange(
ggplot(pseudoCount, aes(x= HVA_1_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_1_miR"),
ggplot(pseudoCount, aes(x= HVA_2_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_2_miR"),
ggplot(pseudoCount, aes(x= HVA_3_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_3_miR"),
ggplot(pseudoCount, aes(x= HVA_4_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_4_miR"),
ggplot(pseudoCount, aes(x= HVA_5_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_5_miR"),
ggplot(pseudoCount, aes(x= HVA_6_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVA_6_miR"),
ggplot(pseudoCount, aes(x= HVAK_1_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_1_miR"),
ggplot(pseudoCount, aes(x= HVAK_2_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_2_miR"),
ggplot(pseudoCount, aes(x= HVAK_3_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_3_miR"),
ggplot(pseudoCount, aes(x= HVAK_4_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_4_miR"),
ggplot(pseudoCount, aes(x= HVAK_5_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_5_miR"),
ggplot(pseudoCount, aes(x= HVAK_6_miR)) + xlab(expression(log[2](count + 1))) + ylab("Number of miRNAs") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "HVAK_6_miR"),
nrow = 3)
dev.off()
## quartz_off_screen
## 2
Check on the gene count distribution across all genes.
#Boxplots
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
## Warning in melt(pseudoCount, variable_name = "Samples"): The melt generic in
## data.table has been passed a data.frame and will attempt to redirect to the
## relevant reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend the
## namespace like reshape2::melt(pseudoCount). In the next version, this warning
## will become an error.
df <- data.frame(df, Condition = substr(df$variable,1,4))
ggplot(df, aes(x=variable, y=value, fill = Condition)) + geom_boxplot() + xlab("") +
ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
pdf("PDF_figure/QC_BoxPlot_merged.pdf",
width = 5,
height = 4)
ggplot(df, aes(x=variable, y=value, fill = Condition)) + geom_boxplot() + xlab("") +
ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
dev.off()
## quartz_off_screen
## 2
#Histograms and density plots
ggplot(df, aes(x=value, colour = variable, fill = variable)) + ylim(c(0, 0.25)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab(expression(log[2](count+1)))
pdf("PDF_figure/QC_densityPlot_merged.pdf",
width = 8,
height = 6)
ggplot(df, aes(x=value, colour = variable, fill = variable)) + ylim(c(0, 0.25)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab(expression(log[2](count+1)))
dev.off()
## quartz_off_screen
## 2
MA plots are used to check for imbalance in sequencing depth between samples of the same condition. I did not generate MA plot for all library pairs. But the example pairs I selected show that there are imbalance in sequencing depth, but the imbalance is quite random and this is common in RNA-Seq datasets.
## HVA1 vs HVA2
x = pseudoCount[, 1]
y = pseudoCount[, 2]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVA_2_miR"))
## `geom_smooth()` using formula 'y ~ x'
## HVA1 vs HVA3
x = pseudoCount[, 1]
y = pseudoCount[, 3]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVA_3_miR"))
## `geom_smooth()` using formula 'y ~ x'
## HVA1 vs HVAK1
x = pseudoCount[, 1]
y = pseudoCount[, 4]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVAK_1_miR"))
## `geom_smooth()` using formula 'y ~ x'
## HVA1 vs HVAK2
x = pseudoCount[, 1]
y = pseudoCount[, 5]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVAK_2_miR"))
## `geom_smooth()` using formula 'y ~ x'
## HVA1 vs HVAK3
x = pseudoCount[, 1]
y = pseudoCount[, 6]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "HVA_1_miR vs HVAK_3_miR"))
## `geom_smooth()` using formula 'y ~ x'
##### Clustering of the sample-to-sample distances This is the sanity check step to confirm that under a un-supervised clustering, HF and HFK samples will cluster together. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named
Hierchical Clustering.pnf
dds_transform <- varianceStabilizingTransformation(dds)
rawCountTable_transform <- as.data.frame(assay(dds_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf("PDF_figure/Hierchical_Clustering.pdf",
width = 6,
height = 6)
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
dev.off()
## quartz_off_screen
## 2
Final output is following:
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
plotPCA(dds_transform, intgroup = "condition", ntop = 500) +
geom_text(aes(label=name), vjust = 2, hjust = -0.1) + xlim(-7,20) + ylim(-10,10)
pdf("PDF_figure/QC_PCAPlot_merged.pdf",
width = 6,
height = 4)
plotPCA(dds_transform, intgroup = "condition", ntop = 500) +
geom_text(aes(label=name), vjust = 2, hjust = -0.1) + xlim(-7,20) + ylim(-10,10)
dev.off()
## quartz_off_screen
## 2
dds_result <- results(dds_analysis, contrast = c("condition", "experimental", "control"))
dds_result <- lfcShrink(dds_analysis, contrast = c("condition", "experimental", "control"), res = dds_result, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
write.csv(rawCountTable, "miRNA_counts.csv")
write.csv(as.data.frame(dds_result), "miRNA_de.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
suppressMessages(library(mosaic))
dif_analysis <- as.data.frame(results(dds_analysis))
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(pseudoCount)))
}
sig_count <- pseudoCount[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:18] <- zscore(as.numeric(sig_dif[i,7:18]))
}
my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
heatmap_matrix <- as.matrix(sig_dif[,7:18])
png('HVAK vs HVA microRNASeq.png',
width = 600,
height = 1000,
res = 100,
pointsize = 8)
par(cex.main=1.1)
heatmap.2(heatmap_matrix,
main = "Differentially expressed\nmiRNA in colon tumor\npadj < 0.05",
density.info = "none",
key = TRUE,
lwid = c(3,7),
lhei = c(1,7),
labRow = rownames(heatmap_matrix),
col=my_palette,
margins = c(12,11),
symbreaks = TRUE,
trace = "none",
dendrogram = "row",
cexCol = 2,
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf("PDF_figure/Heatmap_merged.pdf",
width = 7,
height = 10)
par(cex.main=1.1)
heatmap.2(heatmap_matrix,
main = "Differentially expressed\nmiRNA in colon tumor\npadj < 0.05",
density.info = "none",
key = TRUE,
lwid = c(3,7),
lhei = c(1,7),
labRow = rownames(heatmap_matrix),
col=my_palette,
margins = c(12,11),
symbreaks = TRUE,
trace = "none",
dendrogram = "row",
cexCol = 2,
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
Final output is
# Scatter plot
dif_analysis$KrasG12D_mean <- rowMeans(pseudoCount[,7:12])
dif_analysis$KrasWT_mean <- rowMeans(pseudoCount[,1:6])
ggplot(dif_analysis, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
xlab("HVA_Average(log2)") + ylab("HVAK_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK Scatter Plot")
pdf("PDF_figure/ScatterPlot_merged.pdf",
width = 5,
height = 4)
ggplot(dif_analysis, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
xlab("HVA_Average(log2)") + ylab("HVAK_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK Scatter Plot")
dev.off()
## quartz_off_screen
## 2
# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK MA Plot") + ylim(-4,4)
## Warning: Removed 1 rows containing missing values (geom_point).
pdf("PDF_figure/MAPlot_merged.pdf",
width = 5,
height = 4)
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK MA Plot") + ylim(-4,4)
## Warning: Removed 1 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(padj,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK Volcano Plot") + xlim(-4,4)
## Warning: Removed 164 rows containing missing values (geom_point).
pdf("PDF_figure/VolcanoPlot_merged.pdf",
width = 5,
height = 4)
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(padj,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "HVA vs HVAK Volcano Plot") + xlim(-4,4)
## Warning: Removed 164 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
Load miRNA database
mirna.info <- fread("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/miRNA_Analysis/Analysis/miR_Family_Info.txt", check.names = TRUE)
mirna.info <- mirna.info[Species.ID == 10090]
mirna.info[MiRBase.ID == "mmu-miR-124-3p.1", MiRBase.ID := "mmu-miR-124-3p"]
mirna.info[miR.family == "miR-124-3p.1", miR.family := "miR-124-3p"]
mirna.info[, seed := gsub("U", "T", Seed.m8)]
Load miRNA counts.
mirna.counts <- rawCountTable_no_norm
mirna.counts$miRNA <- rownames(mirna.counts)
mirna.counts <- as.data.table(mirna.counts)
mirna.counts$count <-
rowSums(mirna.counts[,-13])
mirna.counts$count.hva <-
rowSums(mirna.counts[, c("HVA_1_miR", "HVA_2_miR", "HVA_3_miR", "HVA_4_miR", "HVA_5_miR", "HVA_6_miR")])
mirna.counts$count.hvak <-
rowSums(mirna.counts[, c("HVAK_1_miR", "HVAK_2_miR", "HVAK_3_miR", "HVAK_4_miR", "HVAK_5_miR", "HVAK_6_miR")])
mirna.counts <- mirna.counts[order(-count)]
mirna.counts[, miRNA.shortname := miRNA]
which.to.change <- startsWith(mirna.counts$miRNA.shortname, "mmu-")
mirna.counts[which.to.change, ]$miRNA.shortname <-
sapply(strsplit(mirna.counts[which.to.change, ]$miRNA.shortname, "mmu-"),
"[", 2)
which.to.change <- endsWith(mirna.counts$miRNA.shortname, "-5p")
mirna.counts[which.to.change]$miRNA.shortname <-
sapply(strsplit(mirna.counts[which.to.change]$miRNA.shortname, "-5p"),
"[", 1)
which.to.change <- endsWith(mirna.counts$miRNA.shortname, "-3p")
mirna.counts[which.to.change]$miRNA.shortname <-
sapply(strsplit(mirna.counts[which.to.change]$miRNA.shortname, "-3p"),
"[", 1)
dds <- dds[rowSums(DESeq2::counts(dds)) > 200]
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotMA(dds)
mir.dif <- as.data.frame(results(dds, alpha = 0.05, contrast = c("condition", "experimental", "control")))
mir.dif$logP <- -1 * log10(mir.dif$padj)
mir.dif$miRNA <- rownames(mir.dif)
mirna.counts.DGE <- merge(mirna.counts, mir.dif, by = "miRNA", all = TRUE)
setwd("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/Datafiles")
saveRDS(mirna.counts.DGE, "mirna-counts-deseq-09282019.rds")
mirna.info <- merge(mirna.info, mirna.counts[, .(MiRBase.ID = miRNA,
HVA_1_miR, HVA_2_miR, HVA_3_miR, HVA_4_miR, HVA_5_miR,
HVA_6_miR, HVAK_1_miR, HVAK_2_miR, HVAK_3_miR, HVAK_4_miR,
HVAK_5_miR, HVAK_6_miR)],
by = "MiRBase.ID", all.x = TRUE)
mirna.info[, HVA_1_miR := ifelse(is.na(HVA_1_miR), 0, HVA_1_miR)]
mirna.info[, HVA_2_miR := ifelse(is.na(HVA_2_miR), 0, HVA_2_miR)]
mirna.info[, HVA_3_miR := ifelse(is.na(HVA_3_miR), 0, HVA_3_miR)]
mirna.info[, HVA_4_miR := ifelse(is.na(HVA_4_miR), 0, HVA_4_miR)]
mirna.info[, HVA_5_miR := ifelse(is.na(HVA_5_miR), 0, HVA_5_miR)]
mirna.info[, HVA_6_miR := ifelse(is.na(HVA_6_miR), 0, HVA_6_miR)]
mirna.info[, HVAK_1_miR := ifelse(is.na(HVAK_1_miR), 0, HVAK_1_miR)]
mirna.info[, HVAK_2_miR := ifelse(is.na(HVAK_2_miR), 0, HVAK_2_miR)]
mirna.info[, HVAK_3_miR := ifelse(is.na(HVAK_3_miR), 0, HVAK_3_miR)]
mirna.info[, HVAK_4_miR := ifelse(is.na(HVAK_4_miR), 0, HVAK_4_miR)]
mirna.info[, HVAK_5_miR := ifelse(is.na(HVAK_5_miR), 0, HVAK_5_miR)]
mirna.info[, HVAK_6_miR := ifelse(is.na(HVAK_6_miR), 0, HVAK_6_miR)]
mirna.info[, HVA_1_family := sum(HVA_1_miR),
by = miR.family]
mirna.info[, HVA_2_family := sum(HVA_2_miR),
by = miR.family]
mirna.info[, HVA_3_family := sum(HVA_3_miR),
by = miR.family]
mirna.info[, HVA_4_family := sum(HVA_4_miR),
by = miR.family]
mirna.info[, HVA_5_family := sum(HVA_5_miR),
by = miR.family]
mirna.info[, HVA_6_family := sum(HVA_6_miR),
by = miR.family]
mirna.info[, HVAK_1_family := sum(HVAK_1_miR),
by = miR.family]
mirna.info[, HVAK_2_family := sum(HVAK_2_miR),
by = miR.family]
mirna.info[, HVAK_3_family := sum(HVAK_3_miR),
by = miR.family]
mirna.info[, HVAK_4_family := sum(HVAK_4_miR),
by = miR.family]
mirna.info[, HVAK_5_family := sum(HVAK_5_miR),
by = miR.family]
mirna.info[, HVAK_6_family := sum(HVAK_6_miR),
by = miR.family]
mirna.counts.family <- mirna.info[, .(miR.family, Seed.m8, Family.Conservation.,
HVA_1_family, HVA_2_family, HVA_3_family, HVA_4_family, HVA_5_family,
HVA_6_family, HVAK_1_family, HVAK_2_family, HVAK_3_family, HVAK_4_family,
HVAK_5_family, HVAK_6_family)]
mirna.counts.family <- mirna.counts.family[!duplicated(mirna.counts.family)]
setwd("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/Datafiles")
saveRDS(mirna.counts.family, "mirna-counts-by-family-09282019.rds")
dds.family <- DESeqDataSetFromMatrix(mirna.counts.family[,4:15],
colData = data.frame(row.names = colnames(mirna.counts.family[,4:15]),
condition = c(rep("HVA",6), rep("HVAK", 6)),
batch = c('1','1','1','2','2','2','1','1','1','2','2','2')),
design = ~ batch + condition)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
rownames(dds.family) <- mirna.counts.family$miR.family
dds.family <- dds.family[rowSums(DESeq2::counts(dds.family))>200]
dds.family <- DESeq(dds.family)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
mirna.family.res <- results(dds.family, alpha = 0.05, contrast = c("condition", "HVAK", "HVA"))
mirna.family.DGE <- as.data.frame(mirna.family.res)
mirna.family.DGE$miR.family <- rownames(mirna.family.DGE)
mirna.family.DGE <- mirna.family.DGE[order(-mirna.family.DGE$log2FoldChange),]
setwd("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/HEAP-CLIP/HEAP_09282019/Analysis/CLIP_Analysis/Data Visualization/Datafiles")
saveRDS(mirna.family.DGE, "mirna-counts-deseq-by-family-09282019.rds")
cal.z.score <- function(x){
(x - mean(x)) / sd(x)
}
mirna.counts.family.norm <- DESeq2::counts(dds.family, normalized = TRUE)
mirna.counts.family.norm.log <- log2(mirna.counts.family.norm + 1)
mirna.counts.family.norm.z <- as.data.frame(t(apply(mirna.counts.family.norm.log, 1, cal.z.score)))
select <- subset(mirna.family.DGE, baseMean > 2000)
select <- select$miR.family
pheatmap(mirna.counts.family.norm.z[select,],
cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE,
border_color = NA, fontsize_row = 8)
pdf("PDF_figure/Pheatmap_family.pdf",
width = 8,
height = 5)
pheatmap(mirna.counts.family.norm.z[select,],
cluster_rows=TRUE, show_rownames=TRUE, cluster_cols=TRUE,
border_color = NA, fontsize_row = 8)
dev.off()
## pdf
## 3
df <- mirna.family.DGE
df$logP <- -log10(mirna.family.DGE$padj)
p3 <- ggplot(data = df, aes(x = log2FoldChange, y = logP, label = miR.family, size = baseMean)) +
geom_point(alpha = 0.6, colour = "#EC469A", shape = 1) +
#geom_point(data = family.highlight, aes(x = log2FoldChange, y = logP), colour = my.colors[1]) +
#geom_point(data = df[mir.highlight[1],], aes(x = log2FoldChange, y = logP), colour = my.colors[9], size = dotsize) +
#geom_point(data = df[df$Row.names %in% mir.highlight[2:6],], aes(x = log2FoldChange, y = logP), colour = my.colors[4], size = dotsize) +
#geom_point(data = df[mir.highlight[7],], aes(x = log2FoldChange, y = logP), colour = my.colors[2], size = dotsize) +
#geom_point(data = df[mir.highlight[8:11],], aes(x = log2FoldChange, y = logP), colour = my.colors[4], size = dotsize) +
#geom_point(data = df[df$Row.names %in% mir.highlight[12:13],], aes(x = log2FoldChange, y = logP), colour = my.colors[6], size = dotsize) +
#geom_point(data = df[mir.highlight[13],], aes(x = log2FoldChange, y = logP), colour = my.colors[4], size = dotsize) +
xlab("Fold change log2 (HVAK / HVA)") +
ylab("-log10(FDR)") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(size=12, margin = margin(t = 10)),
axis.title.y = element_text(size=12, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_line(size = 0.5),
axis.line.x = element_line(size = 0.5),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold")) + xlim(-4,3)
p4<- ggplotly(p3)
p4
pdf("PDF_figure/VolcanoPlot_expression.pdf",
width = 5,
height = 4)
p3
## Warning: Removed 2 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2